001 /* 002 * PairwiseAlignmentAlgorithm.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 import java.io.Reader; 035 import java.io.IOException; 036 037 /** 038 * This abstract class is the superclass of all classes implementing pairwise sequence 039 * alignment algorithms. Subclasses are required to provide methods to build a high 040 * scoring alignment between two sequences and compute its score with a given scoring 041 * scheme. 042 * 043 * <P>Clients are required to set a scoring scheme and load two sequences before 044 * requesting an alignment or the computation of its score. They typically make the 045 * following sequence of method calls:</P> 046 * 047 * <CODE><BLOCKQUOTE><PRE> 048 * // prepare 049 * PairwiseAlignmentAlgorithm algorithm = new SomePairwiseAlignmentAlgorith (); 050 * algorithm.setScoringScheme (some_scoring_scheme); 051 * algorithm.loadSequences (sequence1, sequence2); 052 * 053 * // now compute the alignment 054 * PairwiseAlignment alignment = algorithm.getPairwiseAlignment(); 055 * int score = algorithm.getScore(); 056 * </PRE></BLOCKQUOTE></CODE> 057 * 058 * @author Sergio A. de Carvalho Jr. 059 * @see PairwiseAlignment 060 */ 061 public abstract class PairwiseAlignmentAlgorithm 062 { 063 /** 064 * Tag character that signals a match in the score tag line of an alignment. Its use 065 * is conditioned by the <CODE>use_match_tag</CODE> flag. 066 * 067 * @see #use_match_tag 068 * @see #useMatchTag 069 */ 070 protected static final char MATCH_TAG = '|'; 071 072 /** 073 * Tag character that signals an approximate match in the score tag line of an 074 * alignment. 075 */ 076 protected static final char APPROXIMATE_MATCH_TAG = '+'; 077 078 /** 079 * Character that signals a mismatch in the score tag line of an alignment. 080 */ 081 protected static final char MISMATCH_TAG = ' '; 082 083 /** 084 * Character that signals a gap in the score tag line of an alignment. 085 */ 086 protected static final char GAP_TAG = ' '; 087 088 /** 089 * Character that signals a gap in sequence. 090 */ 091 protected static final char GAP_CHARACTER = '-'; 092 093 /** 094 * Indicates if the <CODE>MATCH_TAG</CODE> tag should be used or not. If it is 095 * <CODE>true</CODE>, the alignment algorithm should write the <CODE>MATCH_TAG</CODE> 096 * tag in the score tag line of the alignment whenever a match occurs between 097 * characters of the two sequences. If it is <CODE>false</CODE> the matching character 098 * should be written instead. This flag is updated whenever a scoring scheme is set to 099 * this <CODE>PairwiseAlignmentAlgorithm</CODE> by the <CODE>setScoringScheme</CODE> 100 * method. 101 * 102 * @see #MATCH_TAG 103 * @see #useMatchTag 104 * @see #setScoringScheme 105 */ 106 protected boolean use_match_tag; 107 108 /** 109 * The scoring scheme used to compute a pairwise sequence alignment. It must be set 110 * before performing the alignment, and if a new scoring scheme is set, any alignment 111 * or score already computed is lost. 112 */ 113 protected ScoringScheme scoring; 114 115 /** 116 * Stores the product of the last pairwise alignment performed. It contains a string 117 * representation of a highest scoring alignment between the two sequences and its 118 * score. It is set after a successful execution of the 119 * <CODE>computePairwiseAlignment</CODE> method that subclasses must implement. It is 120 * set to null if new sequences are loaded or a new scoring scheme is set. 121 */ 122 protected PairwiseAlignment alignment; 123 124 /** 125 * This field stores just the score of the last pairwise alignment performed (if the 126 * <CODE>score_computed flag</CODE> is set to true). It is useful when just the score 127 * is needed (and not the alignment itselft). Its value is set after a successful 128 * execution of both <CODE>computePairwiseAlignment</CODE> or 129 * <CODE>computeScore</CODE> methods that subclasses must implement. If new sequences 130 * are loaded or a new scoring scheme is set, the <CODE>score_computed</CODE> flag is 131 * set to false, and this field's value becomes undefined. 132 */ 133 protected int score; 134 135 /** 136 * Flags whether the score of the alignment between the last two loaded sequences has 137 * already been computed. It is set to true after a successful execution of both 138 * <CODE>computePairwiseAlignment</CODE> or <CODE>computeScore</CODE> methods that 139 * subclasses must implement. It is set to falsef if new sequences are loaded or a new 140 * scoring scheme is set. 141 */ 142 protected boolean score_computed = false; 143 144 /** 145 * Flags whether sequences have been loaded. It is set to true after subclasses 146 * successfully load two sequences. 147 */ 148 protected boolean sequences_loaded = false; 149 150 /** 151 * Sets the scoring scheme to be used for the next alignments. Any alignment or score 152 * already computed is lost. If the scoring scheme supports partial matches, this 153 * <CODE>PairwiseAlignmentAlgorithm</CODE> is set not to use the 154 * <CODE>MATCH_TAG</CODE> tag because in this case the score tag line be confusing. 155 * If the scoring scheme does not support partial matches, then the use of the 156 * <CODE>MATCH_TAG</CODE> tag is enabled. 157 * 158 * @param scoring Scoring scheme to be used 159 * @see #MATCH_TAG 160 * @see ScoringScheme#isPartialMatchSupported 161 */ 162 public void setScoringScheme (ScoringScheme scoring) 163 { 164 if (scoring == null) 165 throw new IllegalArgumentException ("Null scoring scheme object."); 166 167 this.scoring = scoring; 168 169 // if the scoring scheme supports partial matches, 170 // disable the use of the MATCH_TAG character 171 if (scoring.isPartialMatchSupported()) 172 this.use_match_tag = false; 173 else 174 this.use_match_tag = true; 175 176 // when a new scoring scheme is set, 177 // the alignment needs to be recomputed 178 this.alignment = null; 179 this.score_computed = false; 180 } 181 182 /** 183 * Tells wether the <CODE>MATCH_TAG</CODE> tag should be used or not. If it returns 184 * <CODE>true</CODE>, the alignment algorithm should write the <CODE>MATCH_TAG</CODE> 185 * tag in the score tag line of the alignment produced whenever a match occurs between 186 * characters of the two sequences. If it returns <CODE>false</CODE> the matching 187 * character should be written instead. The value returned is conditioned by the 188 * <CODE>use_match_tag</CODE> flag, which is updated whenever a scoring scheme is set 189 * to this <CODE>PairwiseAlignmentAlgorithm</CODE> by the 190 * <CODE>setScoringScheme</CODE> method. 191 * 192 * @return <CODE>true</CODE if the <CODE>MATCH_TAG</CODE> tag should be used, 193 * <CODE>false</CODE> otherwise 194 * @see #MATCH_TAG 195 * @see #use_match_tag 196 * @see #setScoringScheme 197 */ 198 protected boolean useMatchTag () 199 { 200 return use_match_tag; 201 } 202 203 /** 204 * Request subclasses to load the sequences according to their own needs. Any 205 * alignment and score already computed is lost. If no exception is raised, the loaded 206 * flag is set to true. Subclasses typically store the sequences in instances of an 207 * appropiate class and each can have its own contract, so check each algorithm to see 208 * what kind of sequences it produces. Input can come from any source provided they 209 * are encapsulated with a proper Reader. They must be ready to be read, i.e. the next 210 * read operation must return the sequence's first character. 211 * 212 * @param input1 First sequence 213 * @param input2 Second sequence 214 * @throws IOException If an I/O error occurs when reading the sequences 215 * @throws InvalidSequenceException If the sequences are not valid 216 */ 217 public void loadSequences (Reader input1, Reader input2) 218 throws IOException, InvalidSequenceException 219 { 220 // when new sequences are loaded, the 221 // alignment and score needs to be recomputed 222 this.alignment = null; 223 this.score_computed = false; 224 225 // make sure that if an exception is raised 226 // the sequences_loaded flag is false 227 this.sequences_loaded = false; 228 229 // request subclasses to load sequences 230 loadSequencesInternal (input1, input2); 231 232 // if no exception is raised, 233 // set the loaded flag to true 234 this.sequences_loaded = true; 235 } 236 237 /** 238 * Frees pointer to loaded sequences and computed alignments (if any) so that their 239 * data can be garbage collected. 240 */ 241 public void unloadSequences () 242 { 243 // allow any alignment already computed 244 // to be garbage collected 245 this.alignment = null; 246 this.score_computed = false; 247 248 // request subclasses to unload sequences 249 unloadSequencesInternal (); 250 251 this.sequences_loaded = false; 252 } 253 254 /** 255 * Return the last pairwise alignment computed (if any) or request subclasses to 256 * compute one and return the result by calling the 257 * <CODE>computePairwiseAlignment</CODE> method. The sequences must already be loaded 258 * and a scoring scheme must already be set. 259 * 260 * @return a pairwise alignment between the loaded sequences 261 * @throws IncompatibleScoringSchemeException If the scoring scheme 262 * is not compatible with the loaded sequences 263 * @see #computePairwiseAlignment 264 */ 265 public PairwiseAlignment getPairwiseAlignment () 266 throws IncompatibleScoringSchemeException 267 { 268 if (!sequences_loaded) 269 throw new IllegalStateException ("Sequences have not been loaded."); 270 271 if (scoring == null) 272 throw new IllegalStateException ("Scoring scheme has not been set."); 273 274 if (this.alignment == null) 275 { 276 // make sure the scoring scheme won't be changed 277 // in the middle of the alignment computation 278 synchronized (scoring) 279 { 280 // compute the alignment if it hasn't been computed yet 281 this.alignment = computePairwiseAlignment(); 282 } 283 284 // store the score as well 285 this.score = this.alignment.getScore(); 286 this.score_computed = true; 287 } 288 289 return this.alignment; 290 } 291 292 /** 293 * Returns the score of the last alignment computed (if any) or request subclasses to 294 * compute one and return the result by calling the <CODE>computeScore</CODE> method. 295 * The sequences must already be loaded and a scoring scheme must already be set. 296 * 297 * @return the score of the alignment between the loaded sequences 298 * @throws IncompatibleScoringSchemeException If the scoring scheme 299 * is not compatible with the loaded sequences 300 * @see #computeScore 301 */ 302 public int getScore () throws IncompatibleScoringSchemeException 303 { 304 if (!sequences_loaded) 305 throw new IllegalStateException ("Sequences have not been loaded."); 306 307 if (scoring == null) 308 throw new IllegalStateException ("Scoring scheme has not been set."); 309 310 if (!score_computed) 311 { 312 // make sure the scoring scheme won't be changed 313 // in the middle of the alignment computation 314 synchronized (scoring) 315 { 316 // compute the alignment's score if it hasn't been computed yet 317 this.score = computeScore(); 318 } 319 320 this.score_computed = true; 321 } 322 323 return this.score; 324 } 325 326 /** 327 * Subclasses must implement this method to load sequences according to their own 328 * needs and throw an exception in case of any failure. If no exception is raised, the 329 * loaded flag is set to true by the public method and the sequences are believed to 330 * be loaded (so an alignment or score can be requested). 331 * 332 * @param input1 First sequence 333 * @param input2 Second sequence 334 * @throws IOException If an I/O error occurs when reading the sequences 335 * @throws InvalidSequenceException If the sequences are not valid 336 * @see #loadSequences 337 * @see CharSequence 338 * @see FactorSequence 339 */ 340 protected abstract void loadSequencesInternal (Reader input1, Reader input2) 341 throws IOException, InvalidSequenceException; 342 343 /** 344 * Subclasses must implement this method to unload sequences according to their own 345 * storage, freeing pointers to sequences and any intermediate data so that they can 346 * be garbage collected. This methid is called by the public 347 * <CODE>unloadSequences</CODE> method. 348 * 349 * @see #unloadSequences 350 */ 351 protected abstract void unloadSequencesInternal (); 352 353 /** 354 * Subclasses must implement this method to compute an alignment between the loaded 355 * sequences using the scoring scheme previously set. This methid is called by the 356 * <CODE>getPairwiseAlignment</CODE> method when needed. 357 * 358 * @return a pairwise alignment between the loaded sequences 359 * @throws IncompatibleScoringSchemeException If the scoring scheme 360 * is not compatible with the loaded sequences 361 * @see #getPairwiseAlignment 362 */ 363 protected abstract PairwiseAlignment computePairwiseAlignment () 364 throws IncompatibleScoringSchemeException; 365 366 /** 367 * Subclasses must implement this method to compute the score of the alignment between 368 * the loaded sequences using the scoring scheme previously set. This methid is called 369 * by the <CODE>getScore</CODE> method when needed. 370 * 371 * @return the score of the alignment between the loaded sequences 372 * @throws IncompatibleScoringSchemeException If the scoring scheme 373 * is not compatible with the loaded sequences 374 * @see #getScore 375 */ 376 protected abstract int computeScore () throws IncompatibleScoringSchemeException; 377 378 /** 379 * Helper method to invoke the <CODE>scoreSubstitution</CODE> method of the scoring 380 * scheme set to this algorithm. 381 * 382 * @param a first character 383 * @param b second character 384 * @return score of substitution of <CODE>a</CODE> for <CODE>b</CODE> 385 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible 386 * with the sequences being aligned 387 * @see ScoringScheme#scoreSubstitution 388 */ 389 protected final int scoreSubstitution (char a, char b) 390 throws IncompatibleScoringSchemeException 391 { 392 return scoring.scoreSubstitution (a, b); 393 } 394 395 /** 396 * Helper method to invoke the <CODE>scoreInsertion</CODE> method of the scoring 397 * scheme set to this algorithm. 398 * 399 * @param a the character to be inserted 400 * @return score of insertion of <CODE>a</CODE> 401 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible 402 * with the sequences being aligned 403 * @see ScoringScheme#scoreInsertion 404 */ 405 protected final int scoreInsertion (char a) throws IncompatibleScoringSchemeException 406 { 407 return scoring.scoreInsertion (a); 408 } 409 410 /** 411 * Helper method to invoke the <CODE>scoreDeletion</CODE> method of the scoring scheme 412 * set to this algorithm. 413 * 414 * @param a the character to be deleted 415 * @return score of deletion of <CODE>a</CODE> 416 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible 417 * with the sequences being aligned 418 * @see ScoringScheme#scoreDeletion 419 */ 420 protected final int scoreDeletion (char a) throws IncompatibleScoringSchemeException 421 { 422 return scoring.scoreDeletion (a); 423 } 424 425 /** 426 * Helper method to compute the the greater of two values. 427 * 428 * @param v1 first value 429 * @param v2 second value 430 * @return the larger of <CODE>v1</CODE> and <CODE>v2</CODE> 431 */ 432 protected final int max (int v1, int v2) 433 { 434 return (v1 >= v2) ? v1 : v2; 435 } 436 437 /** 438 * Helper method to compute the the greater of three values. 439 * 440 * @param v1 first value 441 * @param v2 second value 442 * @param v3 third value 443 * @return the larger of <CODE>v1</CODE>, <CODE>v2</CODE> and <CODE>v3</CODE> 444 */ 445 protected final int max (int v1, int v2, int v3) 446 { 447 return (v1 >= v2) ? ((v1 >= v3)? v1 : v3) : ((v2 >= v3)? v2 : v3); 448 } 449 450 /** 451 * Helper method to compute the the greater of four values. 452 * 453 * @param v1 first value 454 * @param v2 second value 455 * @param v3 third value 456 * @param v4 fourth value 457 * @return the larger of <CODE>v1</CODE>, <CODE>v2</CODE> <CODE>v3</CODE> and 458 * <CODE>v4</CODE> 459 */ 460 protected final int max (int v1, int v2, int v3, int v4) 461 { 462 int m1 = ((v1 >= v2) ? v1 : v2); 463 int m2 = ((v3 >= v4) ? v3 : v4); 464 465 return (m1 >= m2) ? m1 : m2; 466 } 467 }